Group 6 - Data Mining Project¶

Winter Term, 2024¶

Ariel Azria, Ashley Nicolas, Alex Yeager, Ce Zhang, Arpan Pradhan

Data Import, Cleaning, and Pre-processing¶

Import the Data¶

In [1]:
# Run this cell if using Google colab and delete the first and last rows
from google.colab import drive
drive.mount('/content/drive')

drive.mount('/content/gdrive')
drive_path = '/content/gdrive/Shareddrives/Data_Mining/Dataset/'
Mounted at /content/drive
Mounted at /content/gdrive
In [2]:
# Preliminary: libraries and data import

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt
import sys

sys.path.append(drive_path)
import dmba

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Replace w/ the dataset here. Update working directory as necessary.
df = pd.read_csv(drive_path + "crx.data", header = None)
In [3]:
dmba
Out[3]:
<module 'dmba' from '/content/gdrive/Shareddrives/Data_Mining/Dataset/dmba.py'>
In [4]:
 # Seeing the data features
df

# Columns aren't named and the values are non-sensical.
Out[4]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
685 b 21.08 10.085 y p e h 1.25 f f 0 f g 00260 0 -
686 a 22.67 0.750 u g c v 2.00 f t 2 t g 00200 394 -
687 a 25.25 13.500 y p ff ff 2.00 f t 1 t g 00200 1 -
688 b 17.92 0.205 u g aa v 0.04 f f 0 f g 00280 750 -
689 b 35.00 3.375 u g c h 8.29 f f 0 t g 00000 0 -

690 rows × 16 columns

In [5]:
# First, let's rename the columns to be able to see what we're looking at in each column

# Dictionary for renaming columns
d = {0: 'Gender', 1: 'Age', 2: 'Debt', 3: 'Married', 4: 'BankCustomer',
     5: 'EducationLevel', 6: 'Ethnicity', 7: 'YearsEmployed', 8: 'PriorDefault',
     9: 'Employed', 10: 'CreditScore', 11: 'DriversLicense', 12: 'Citizen',
     13: 'ZipCode', 14: 'Income', 15: 'ApprovalStatus'}

# Rename columns using the dictionary
df.rename(columns=d, inplace=True)
In [6]:
# Dimensions of data
df.shape
Out[6]:
(690, 16)
In [7]:
df.size # size of data frame
Out[7]:
11040
In [8]:
df
Out[8]:
Gender Age Debt Married BankCustomer EducationLevel Ethnicity YearsEmployed PriorDefault Employed CreditScore DriversLicense Citizen ZipCode Income ApprovalStatus
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
685 b 21.08 10.085 y p e h 1.25 f f 0 f g 00260 0 -
686 a 22.67 0.750 u g c v 2.00 f t 2 t g 00200 394 -
687 a 25.25 13.500 y p ff ff 2.00 f t 1 t g 00200 1 -
688 b 17.92 0.205 u g aa v 0.04 f f 0 f g 00280 750 -
689 b 35.00 3.375 u g c h 8.29 f f 0 t g 00000 0 -

690 rows × 16 columns

In [9]:
# Count the number of '?' values in our dataset so they're not treated as a group by LabelEncoder and mess up our models
for col in df.columns:
    count = (df[col] == '?').sum()
    if count > 0:
        print(f"Column {col}: ? Count: {count}")
Column Gender: ? Count: 12
Column Age: ? Count: 12
Column Married: ? Count: 6
Column BankCustomer: ? Count: 6
Column EducationLevel: ? Count: 9
Column Ethnicity: ? Count: 9
Column ZipCode: ? Count: 13
In [10]:
# Replace all the '?' values with NA's
df.replace('?', np.nan, inplace=True)
In [11]:
# Check if the '?' values are gone
for col in df.columns:
    count = (df[col] == '?').sum()
    if count > 0:
        print(f"Column {col}:? Count: {count}")
    else:
      print(f"Column {col}: No ? values found.")
Column Gender: No ? values found.
Column Age: No ? values found.
Column Debt: No ? values found.
Column Married: No ? values found.
Column BankCustomer: No ? values found.
Column EducationLevel: No ? values found.
Column Ethnicity: No ? values found.
Column YearsEmployed: No ? values found.
Column PriorDefault: No ? values found.
Column Employed: No ? values found.
Column CreditScore: No ? values found.
Column DriversLicense: No ? values found.
Column Citizen: No ? values found.
Column ZipCode: No ? values found.
Column Income: No ? values found.
Column ApprovalStatus: No ? values found.
In [12]:
# Print a plot of all the new missing values that were plugged in instead of '?'
null_values = df.isnull().sum()

plt.figure(figsize=(12, 6))
null_values.plot(kind="bar")
plt.ylabel("Number of Null Values")
plt.title("Total Count of Null Values per Column")
plt.xticks(rotation=45)
plt.show()
In [13]:
df.dtypes
Out[13]:
Gender             object
Age                object
Debt              float64
Married            object
BankCustomer       object
EducationLevel     object
Ethnicity          object
YearsEmployed     float64
PriorDefault       object
Employed           object
CreditScore         int64
DriversLicense     object
Citizen            object
ZipCode            object
Income              int64
ApprovalStatus     object
dtype: object
In [14]:
# Convert age into a float type because it is being read as an object

df["Age"] = df["Age"].astype(float)
df.dtypes
Out[14]:
Gender             object
Age               float64
Debt              float64
Married            object
BankCustomer       object
EducationLevel     object
Ethnicity          object
YearsEmployed     float64
PriorDefault       object
Employed           object
CreditScore         int64
DriversLicense     object
Citizen            object
ZipCode            object
Income              int64
ApprovalStatus     object
dtype: object
In [15]:
# Replacing the NA values with mode and median
cat_cols = [0, 3, 4, 5, 6, 12, 13]

# Using mode for categorical variables
for col in cat_cols:
  mode = df.iloc[:,col].dropna().mode()
  if not mode.empty:
    df.iloc[:, col] = df.iloc[:,col].fillna(mode.iloc[0])

continuous_cols = [1, 2, 7, 14]

# Using median for continuous variables
for col in continuous_cols:
  median = df.iloc[:, col].median()
  df.iloc[:, col] = df.iloc[:, col].fillna(median)

df.head()
Out[15]:
Gender Age Debt Married BankCustomer EducationLevel Ethnicity YearsEmployed PriorDefault Employed CreditScore DriversLicense Citizen ZipCode Income ApprovalStatus
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
In [16]:
df.dtypes
Out[16]:
Gender             object
Age               float64
Debt              float64
Married            object
BankCustomer       object
EducationLevel     object
Ethnicity          object
YearsEmployed     float64
PriorDefault       object
Employed           object
CreditScore         int64
DriversLicense     object
Citizen            object
ZipCode            object
Income              int64
ApprovalStatus     object
dtype: object
In [17]:
# Now, let's recode the values in the columns into numeric values so we can analyze data distributions
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in df.columns:
    if df[col].dtypes=='object':
      df[col] = df[col].astype(str)
      df[col]=le.fit_transform(df[col])
In [18]:
df
Out[18]:
Gender Age Debt Married BankCustomer EducationLevel Ethnicity YearsEmployed PriorDefault Employed CreditScore DriversLicense Citizen ZipCode Income ApprovalStatus
0 1 30.83 0.000 1 0 12 7 1.25 1 1 1 0 0 68 0 0
1 0 58.67 4.460 1 0 10 3 3.04 1 1 6 0 0 11 560 0
2 0 24.50 0.500 1 0 10 3 1.50 1 0 0 0 0 96 824 0
3 1 27.83 1.540 1 0 12 7 3.75 1 1 5 1 0 31 3 0
4 1 20.17 5.625 1 0 12 7 1.71 1 0 0 0 2 37 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
685 1 21.08 10.085 2 2 4 3 1.25 0 0 0 0 0 90 0 1
686 0 22.67 0.750 1 0 1 7 2.00 0 1 2 1 0 67 394 1
687 0 25.25 13.500 2 2 5 2 2.00 0 1 1 1 0 67 1 1
688 1 17.92 0.205 1 0 0 7 0.04 0 0 0 0 0 96 750 1
689 1 35.00 3.375 1 0 1 3 8.29 0 0 0 1 0 0 0 1

690 rows × 16 columns

In [19]:
# Print the unique values in each categorical column and the levels there are such as 2 levels for [0,1] and 4+ for education level
categorical_columns = ['Gender', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity',
                       'PriorDefault', 'Employed', 'DriversLicense', 'Citizen']
for col in categorical_columns:
  print(f"Column {col}")
  print("Levels:", df[col].unique())
  print("\n")
Column Gender
Levels: [1 0]


Column Married
Levels: [1 2 0]


Column BankCustomer
Levels: [0 2 1]


Column EducationLevel
Levels: [12 10  9 11  2  8  1  3 13  6  4  0  5  7]


Column Ethnicity
Levels: [7 3 0 2 4 8 6 1 5]


Column PriorDefault
Levels: [1 0]


Column Employed
Levels: [1 0]


Column DriversLicense
Levels: [0 1]


Column Citizen
Levels: [0 2 1]


In [20]:
# Great, this worked.
# Now we have data in each column that's numeric
df.describe().T
Out[20]:
count mean std min 25% 50% 75% max
Gender 690.0 0.695652 0.460464 0.00 0.000 1.00 1.0000 1.00
Age 690.0 31.514116 11.860245 13.75 22.670 28.46 37.7075 80.25
Debt 690.0 4.758725 4.978163 0.00 1.000 2.75 7.2075 28.00
Married 690.0 1.233333 0.430063 0.00 1.000 1.00 1.0000 2.00
BankCustomer 690.0 0.475362 0.850238 0.00 0.000 0.00 0.0000 2.00
EducationLevel 690.0 5.698551 4.285748 0.00 1.000 5.00 10.0000 13.00
Ethnicity 690.0 5.098551 2.510731 0.00 3.000 7.00 7.0000 8.00
YearsEmployed 690.0 2.223406 3.346513 0.00 0.165 1.00 2.6250 28.50
PriorDefault 690.0 0.523188 0.499824 0.00 0.000 1.00 1.0000 1.00
Employed 690.0 0.427536 0.495080 0.00 0.000 0.00 1.0000 1.00
CreditScore 690.0 2.400000 4.862940 0.00 0.000 0.00 3.0000 67.00
DriversLicense 690.0 0.457971 0.498592 0.00 0.000 0.00 1.0000 1.00
Citizen 690.0 0.176812 0.557869 0.00 0.000 0.00 0.0000 2.00
ZipCode 690.0 56.189855 46.386934 0.00 17.000 52.00 93.0000 169.00
Income 690.0 1017.385507 5210.102598 0.00 0.000 5.00 395.5000 100000.00
ApprovalStatus 690.0 0.555072 0.497318 0.00 0.000 1.00 1.0000 1.00
In [21]:
# Check for any last NA values
df.isnull().sum()
Out[21]:
Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64
In [22]:
# It looks like we have 0s in almost every column, even if that column is continuous
# Let's start cleaning the data

Data Cleaning¶

In [23]:
# Plotting all the variables to see where 0s need to be removed
plt.figure(figsize=(12, 8))
for i, col in enumerate(df, start=1):
    plt.subplot(4, 4, i)
    sns.histplot(data=df, x=col, kde=True)
plt.tight_layout()
plt.show()

We decided not to remove zeros from the code. We already removed NA values, so we assume that the 0s in the categorical variables are categories rather than NAs.

In addition, since we're a community bank, we expect to see many zeros in years employed, as our clients may not have had traditional employment. In addition, the zip code data have been anonymized, so the number of 0s in this column is not concerning. We also expect some applicants to have a zero income, as this indicates they are unemployed.

Split Test and Train Set¶

In [24]:
# Starting with initial data set
X= df.drop(["Age", "Gender","DriversLicense", "ZipCode", "Ethnicity", "Citizen", "ApprovalStatus"], axis=1)
y=df["ApprovalStatus"]

# Scaling the data because there are different units and scales, as seen above
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
X_scaled = Scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_subset = pd.concat([df_scaled, y], axis=1)

# Check the first few rows of the new DataFrame
df_subset.head()
Out[24]:
Debt Married BankCustomer EducationLevel YearsEmployed PriorDefault Employed CreditScore Income ApprovalStatus
0 -0.956613 -0.54295 -0.559499 1.471393 -0.291083 0.95465 1.157144 -0.288101 -0.195413 0
1 -0.060051 -0.54295 -0.559499 1.004392 0.244190 0.95465 1.157144 0.740830 -0.087852 0
2 -0.856102 -0.54295 -0.559499 1.004392 -0.216324 0.95465 -0.864196 -0.493887 -0.037144 0
3 -0.647038 -0.54295 -0.559499 1.471393 0.456505 0.95465 1.157144 0.535044 -0.194837 0
4 0.174141 -0.54295 -0.559499 1.471393 -0.153526 0.95465 -0.864196 -0.493887 -0.195413 0
In [25]:
# Checking our work

plt.figure(figsize=(12, 8))
for i, col in enumerate(df_scaled, start=1):
    plt.subplot(3,3, i)
    sns.histplot(data=df_scaled, x=col, kde=True)
plt.tight_layout()
plt.show()

# This looks much better

Data Visualizations¶

In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define numerical columns based on df_subset columns
numerical_cols = ['Debt', 'Married', 'BankCustomer', 'EducationLevel', 'YearsEmployed',
                  'PriorDefault', 'Employed', 'CreditScore', 'Income']

# Plot histograms
plt.figure(figsize=(12, 8))
for i, col in enumerate(numerical_cols, start=1):
    plt.subplot(3, 3, i)
    sns.histplot(data=df, x=col, kde=True)
plt.tight_layout()
plt.show()
In [27]:
df_subset.columns
Out[27]:
Index(['Debt', 'Married', 'BankCustomer', 'EducationLevel', 'YearsEmployed',
       'PriorDefault', 'Employed', 'CreditScore', 'Income', 'ApprovalStatus'],
      dtype='object')
In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define categorical columns
categorical_cols = ['Married', 'BankCustomer', 'EducationLevel', 'PriorDefault', 'Employed']

# Plot bar plots for categorical variables
plt.figure(figsize=(12,8))
for i, col in enumerate(categorical_cols, start=1):
    plt.subplot(2,3, i)
    sns.countplot(data=df, x=col)
    plt.title(col)
plt.tight_layout()
plt.show()

Correlation Matrix¶

In [29]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the correlation matrix of the full dataframe
corr_matrix = df.corr()

# Visualize the correlation matrix
plt.figure(figsize=(10, 8))  # Adjust the size as needed
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar=True)
plt.title('Correlation Matrix of Remaining Fields')
plt.show()
In [30]:
print(df_subset.dtypes)
Debt              float64
Married           float64
BankCustomer      float64
EducationLevel    float64
YearsEmployed     float64
PriorDefault      float64
Employed          float64
CreditScore       float64
Income            float64
ApprovalStatus      int64
dtype: object

Modeling¶

In [31]:
# Looking at our subset of data we've decided to include in the models
correlation_matrix = df_subset.corr()
print(correlation_matrix)
                    Debt   Married  BankCustomer  EducationLevel  \
Debt            1.000000 -0.091526     -0.079364        0.023373   
Married        -0.091526  1.000000      0.982257       -0.049977   
BankCustomer   -0.079364  0.982257      1.000000       -0.055812   
EducationLevel  0.023373 -0.049977     -0.055812        1.000000   
YearsEmployed   0.298902 -0.080624     -0.073064        0.037001   
PriorDefault    0.244317 -0.129863     -0.142094        0.109642   
Employed        0.174846 -0.162464     -0.173199        0.128549   
CreditScore     0.271207 -0.106457     -0.112750        0.006978   
Income          0.123121 -0.120065     -0.025170        0.004808   
ApprovalStatus -0.206294  0.194306      0.185134       -0.130434   

                YearsEmployed  PriorDefault  Employed  CreditScore    Income  \
Debt                 0.298902      0.244317  0.174846     0.271207  0.123121   
Married             -0.080624     -0.129863 -0.162464    -0.106457 -0.120065   
BankCustomer        -0.073064     -0.142094 -0.173199    -0.112750 -0.025170   
EducationLevel       0.037001      0.109642  0.128549     0.006978  0.004808   
YearsEmployed        1.000000      0.345689  0.222982     0.322330  0.051345   
PriorDefault         0.345689      1.000000  0.432032     0.379532  0.090012   
Employed             0.222982      0.432032  1.000000     0.571498  0.077652   
CreditScore          0.322330      0.379532  0.571498     1.000000  0.063692   
Income               0.051345      0.090012  0.077652     0.063692  1.000000   
ApprovalStatus      -0.322475     -0.720407 -0.458301    -0.406410 -0.175657   

                ApprovalStatus  
Debt                 -0.206294  
Married               0.194306  
BankCustomer          0.185134  
EducationLevel       -0.130434  
YearsEmployed        -0.322475  
PriorDefault         -0.720407  
Employed             -0.458301  
CreditScore          -0.406410  
Income               -0.175657  
ApprovalStatus        1.000000  
In [32]:
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
In [33]:
# It looks like married and BankCustomer are perfectly correlated, so we'll dropped "Married" as well.
# Banks are also not allowed to use marital status to make credit decisions
In [36]:
print(y_train.value_counts())
1    306
0    246
Name: ApprovalStatus, dtype: int64
In [37]:
print(y_train.unique())
[0 1]
In [38]:
# Cleaning our dataframe to contain the final values
df_original = df

cols_to_keep = ['Debt', 'BankCustomer', 'EducationLevel', 'YearsEmployed', 'PriorDefault',
                'Employed', 'CreditScore', 'Income', 'ApprovalStatus']

df = df[cols_to_keep]

df
Out[38]:
Debt BankCustomer EducationLevel YearsEmployed PriorDefault Employed CreditScore Income ApprovalStatus
0 0.000 0 12 1.25 1 1 1 0 0
1 4.460 0 10 3.04 1 1 6 560 0
2 0.500 0 10 1.50 1 0 0 824 0
3 1.540 0 12 3.75 1 1 5 3 0
4 5.625 0 12 1.71 1 0 0 0 0
... ... ... ... ... ... ... ... ... ...
685 10.085 2 4 1.25 0 0 0 0 1
686 0.750 0 1 2.00 0 1 2 394 1
687 13.500 2 5 2.00 0 1 1 1 1
688 0.205 0 0 0.04 0 0 0 750 1
689 3.375 0 1 8.29 0 0 0 0 1

690 rows × 9 columns

Decision Tree Classifer¶

In [39]:
from sklearn.tree import DecisionTreeClassifier

# Define features (X) and target variable (y)
X = df.drop('ApprovalStatus', axis=1)
y = df['ApprovalStatus']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Initialize and train the Decision Tree Classifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf.fit(x_train, y_train)
Out[39]:
DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
In [40]:
import pydotplus
from IPython.display import Image
# Adding this line myself
from sklearn import tree

# Resuming to code that was in the notebook
dot_data = tree.export_graphviz(clf, feature_names=x_train.columns
                , class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[40]:
In [41]:
print(y_train.value_counts())
1    306
0    246
Name: ApprovalStatus, dtype: int64
In [42]:
clf_2 = DecisionTreeClassifier(criterion='entropy',
                             random_state=0)
clf = clf_2.fit(x_train, y_train)

dot_data = tree.export_graphviz(clf_2, feature_names=x_train.columns
                , class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

# This clearly overfits the data
Out[42]:
In [43]:
clf_3 = DecisionTreeClassifier(criterion='entropy', max_depth = 5,
                             random_state=0)
clf = clf_3.fit(x_train, y_train)

dot_data = tree.export_graphviz(clf_3, feature_names=x_train.columns
                , class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[43]:
In [44]:
# Model statistics
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_train_pred = clf.predict(x_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy: ", train_accuracy)

# Predict on the test set
y_test_pred = clf.predict(x_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy: ", test_accuracy)

# Calculate precision and recall for the test set
# Note: Adjust the `average` parameter based on your problem (binary, multi-class, etc.)
precision = precision_score(y_test, y_test_pred, average='binary') # for binary classification
recall = recall_score(y_test, y_test_pred, average='binary') # for binary classification
print("Precision: ", precision)
print("Recall: ", recall)
Training Accuracy:  0.8894927536231884
Test Accuracy:  0.8478260869565217
Precision:  0.9117647058823529
Recall:  0.8051948051948052
In [45]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions
y_pred = clf.predict(x_test)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

Random Forest Model¶

In [46]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

param_grid = {
    'n_estimators': np.arange(50, 201, 50),
    'max_depth': [None] + list(np.arange(5, 30, 5)),
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 11, 2),
    'bootstrap': [True, False],
}

rf = RandomForestClassifier(random_state=0)

# Configure Grid Search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

# Fit Grid Search to the data
grid_search.fit(X, y)

# Best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Print the top 5 parameter combinations
print("\nTop 5 parameter combinations:")
# Convert cv_results_ to a DataFrame for easier manipulation
results_df = pd.DataFrame(grid_search.cv_results_)
# Sort the results by 'rank_test_score' and select the top 5
top_5_results = results_df.sort_values(by='rank_test_score').head(5)

# Iterate over the rows of the top 5 results and print the parameters and their corresponding mean test score
for index, row in top_5_results.iterrows():
    print(f"Rank: {row['rank_test_score']}, Mean Test Score: {row['mean_test_score']:.4f}, Params: {row['params']}")
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
Best parameters found:  {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}

Top 5 parameter combinations:
Rank: 1, Mean Test Score: 0.8551, Params: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
Rank: 2, Mean Test Score: 0.8536, Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': False, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': False, 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}
In [47]:
# Still using separate train-test split for a final evaluation:
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, df.index, test_size=0.2, random_state=0, stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
Out[47]:
((483, 8), (207, 8), (483,), (207,))
In [48]:
X
Out[48]:
Debt BankCustomer EducationLevel YearsEmployed PriorDefault Employed CreditScore Income
0 0.000 0 12 1.25 1 1 1 0
1 4.460 0 10 3.04 1 1 6 560
2 0.500 0 10 1.50 1 0 0 824
3 1.540 0 12 3.75 1 1 5 3
4 5.625 0 12 1.71 1 0 0 0
... ... ... ... ... ... ... ... ...
685 10.085 2 4 1.25 0 0 0 0
686 0.750 0 1 2.00 0 1 2 394
687 13.500 2 5 2.00 0 1 1 1
688 0.205 0 0 0.04 0 0 0 750
689 3.375 0 1 8.29 0 0 0 0

690 rows × 8 columns

In [49]:
rf = RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split= 4, min_samples_leaf=1, max_depth=5, bootstrap=True)
rf.fit(X_train, y_train)
test_score = rf.score(X_test, y_test)
print("Final Test Set Accuracy: ", test_score)
Final Test Set Accuracy:  0.8888888888888888
In [50]:
# variable (feature) importance plot
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

df_scores = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
df_scores = df_scores.sort_values('importance')
print(df_scores)
          feature  importance       std
1    BankCustomer    0.016046  0.020176
2  EducationLevel    0.048985  0.046039
0            Debt    0.077667  0.051225
5        Employed    0.089832  0.121144
3   YearsEmployed    0.094340  0.087963
7          Income    0.132432  0.117848
6     CreditScore    0.133783  0.141691
4    PriorDefault    0.406915  0.216781
In [51]:
import matplotlib.pyplot as plt

ax = df_scores.plot(kind='barh', xerr='std', x='feature', legend=False)
ax.set_ylabel('')
plt.show()
In [52]:
# confusion matrix for train set
dmba.classificationSummary(y_train, rf.predict(X_train))

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Predict on the test set
y_pred = rf.predict(X_train)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:\n", conf_matrix)

#  accuracy
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy: ", accuracy)

#  precision
precision = precision_score(y_train, y_pred, average='binary')
print("Precision: ", precision)

# Calculate recall
recall = recall_score(y_train, y_pred, average='binary')
print("Recall: ", recall)
Confusion Matrix (Accuracy 0.8986)

       Prediction
Actual   0   1
     0 190  25
     1  24 244
Confusion Matrix:
 [[190  25]
 [ 24 244]]
Accuracy:  0.8985507246376812
Precision:  0.9070631970260223
Recall:  0.9104477611940298
In [53]:
# confusion matrix for TEST set
dmba.classificationSummary(y_test, rf.predict(X_test))

# Predict on the test set
y_pred = rf.predict(X_test)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

#  accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

#  precision
precision = precision_score(y_test, y_pred, average='binary')
print("Precision: ", precision)

# Calculate recall
recall = recall_score(y_test, y_pred, average='binary')
print("Recall: ", recall)
Confusion Matrix (Accuracy 0.8889)

       Prediction
Actual   0   1
     0  81  11
     1  12 103
Confusion Matrix:
 [[ 81  11]
 [ 12 103]]
Accuracy:  0.8888888888888888
Precision:  0.9035087719298246
Recall:  0.8956521739130435

Testing the Random Forest for Bias¶

We want to make sure that features like ethnicity, gender are not biased in our data. We excluded these columns for legal reasons, but we need to ensure that our predicted approvals are not biased against these protected classes.

In [54]:
# merge predictions with broader set of original DF values to compare
predictions = rf.predict(X_test)

# Create a DataFrame from indices_test and predictions
predictions_df = pd.DataFrame({'ID': indices_test, 'Prediction': predictions})

# Merge this DataFrame with the original to include sensitive attributes
merged_df = pd.merge(df_original, predictions_df, left_index=True, right_index=True, how='left')
In [55]:
merged_df.head(5)
Out[55]:
Gender Age Debt Married BankCustomer EducationLevel Ethnicity YearsEmployed PriorDefault Employed CreditScore DriversLicense Citizen ZipCode Income ApprovalStatus ID Prediction
0 1 30.83 0.000 1 0 12 7 1.25 1 1 1 0 0 68 0 0 194.0 0.0
1 0 58.67 4.460 1 0 10 3 3.04 1 1 6 0 0 11 560 0 132.0 0.0
2 0 24.50 0.500 1 0 10 3 1.50 1 0 0 0 0 96 824 0 337.0 1.0
3 1 27.83 1.540 1 0 12 7 3.75 1 1 5 1 0 31 3 0 271.0 1.0
4 1 20.17 5.625 1 0 12 7 1.71 1 0 0 0 2 37 0 0 579.0 0.0
In [56]:
# Calculate approval rates by Gender
approval_rates = merged_df.groupby('Gender')['Prediction'].mean()

# Calculate counts of applications by Gender
application_counts = merged_df.groupby('Gender')['Prediction'].count()

# Combine both metrics into a single DataFrame
summary_df = pd.DataFrame({
    'Application Count': application_counts,
    'Approval Rate': approval_rates
})

# Convert approval rate to percentage for better readability
summary_df['Approval Rate'] = summary_df['Approval Rate'] * 100

print("Approval rates and application counts by Gender:")
print(summary_df)
Approval rates and application counts by Gender:
        Application Count  Approval Rate
Gender                                  
0                      69      56.521739
1                     138      54.347826
In [57]:
import scipy.stats as stats

merged_df = merged_df.dropna()

groups = [group['Prediction'].values for name, group in merged_df.groupby('Gender')]

# Conduct ANOVA
anova_result = stats.f_oneway(*groups)

print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 0.0870, p-value: 0.7683
We fail to reject the null hypothesis - no significant difference in approval rates between groups.
In [58]:
# Calculate approval rates by Gender
approval_rates = merged_df.groupby('Ethnicity')['Prediction'].mean()

# Calculate counts of applications by Gender
application_counts = merged_df.groupby('Ethnicity')['Prediction'].count()

# Combine both metrics into a single DataFrame
summary_df = pd.DataFrame({
    'Application Count': application_counts,
    'Approval Rate': approval_rates
})

# Convert approval rate to percentage for better readability
summary_df['Approval Rate'] = summary_df['Approval Rate'] * 100

print("Approval rates and application counts by Ethnicity:")
print(summary_df)
Approval rates and application counts by Ethnicity:
           Application Count  Approval Rate
Ethnicity                                  
0                         24      54.166667
2                          4      75.000000
3                         60      56.666667
4                          2     100.000000
7                        114      53.508772
8                          3      33.333333
In [59]:
#Running an ANOVA test to see if there are any statistically significant differences between groups.
In [60]:
groups = [group['Prediction'].values for name, group in merged_df.groupby('Ethnicity')]

# Conduct ANOVA
anova_result = stats.f_oneway(*groups)

print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 0.5969, p-value: 0.7023
We fail to reject the null hypothesis - no significant difference in approval rates between groups.
In [61]:
# Calculate approval rates by Gender
approval_rates = merged_df.groupby('Citizen')['Prediction'].mean()

# Calculate counts of applications by Gender
application_counts = merged_df.groupby('Citizen')['Prediction'].count()

# Combine both metrics into a single DataFrame
summary_df = pd.DataFrame({
    'Application Count': application_counts,
    'Approval Rate': approval_rates
})

# Convert approval rate to percentage for better readability
summary_df['Approval Rate'] = summary_df['Approval Rate'] * 100

print("Approval rates and application counts by Citizenship:")
print(summary_df)
Approval rates and application counts by Citizenship:
         Application Count  Approval Rate
Citizen                                  
0                      192      56.770833
1                        1       0.000000
2                       14      35.714286
In [62]:
groups = [group['Prediction'].values for name, group in merged_df.groupby('Citizen')]

# Conduct ANOVA
anova_result = stats.f_oneway(*groups)

print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 1.7900, p-value: 0.1696
We fail to reject the null hypothesis - no significant difference in approval rates between groups.

XGBOOST¶

In [72]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y) of the scaled subset
X = df_subset.drop(['ApprovalStatus', 'Married'], axis=1)  # Features
y = df_subset['ApprovalStatus']  # Target variable
print(X.head())

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Display the shapes of the train and test sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
       Debt  BankCustomer  EducationLevel  YearsEmployed  PriorDefault  \
0 -0.956613     -0.559499        1.471393      -0.291083       0.95465   
1 -0.060051     -0.559499        1.004392       0.244190       0.95465   
2 -0.856102     -0.559499        1.004392      -0.216324       0.95465   
3 -0.647038     -0.559499        1.471393       0.456505       0.95465   
4  0.174141     -0.559499        1.471393      -0.153526       0.95465   

   Employed  CreditScore    Income  
0  1.157144    -0.288101 -0.195413  
1  1.157144     0.740830 -0.087852  
2 -0.864196    -0.493887 -0.037144  
3  1.157144     0.535044 -0.194837  
4 -0.864196    -0.493887 -0.195413  
X_train shape: (552, 8)
X_test shape: (138, 8)
y_train shape: (552,)
y_test shape: (138,)
In [73]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import xgboost as xgb  # Importing the xgboost package

# Create a parameter distribution to use Random Search for hyperparameter tuning
param_dist = {
    "max_depth":randint(1,10),
    "min_child_weight": randint(1,5),
    "gamma": randint(0,2),
    "learning_rate": uniform(0.001, 0.2),
    "n_estimators": randint(50,300),
    "scale_pos_weight": randint(1,10),
    "alpha" : uniform(0,100)
}

xgboost = xgb.XGBClassifier()  # Creating an instance of XGBClassifier

random_search = RandomizedSearchCV(xgboost, param_distributions=param_dist, n_iter=200, cv=5, verbose=1, random_state=0)

random_search.fit(X_train, y_train)

print("Score:", random_search.best_score_)
print("Parameters:", random_search.best_params_)

bestXGB = random_search.best_estimator_
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Score: 0.8532350532350532
Parameters: {'alpha': 47.360041934665745, 'gamma': 0, 'learning_rate': 0.15584673788684333, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 84, 'scale_pos_weight': 1}
In [74]:
print('Training dataset performance:')
dmba.classificationSummary(y_train, bestXGB.predict(X_train))

print('\nTest dataset performance:')
dmba.classificationSummary(y_test, bestXGB.predict(X_test))
Training dataset performance:
Confusion Matrix (Accuracy 0.8533)

       Prediction
Actual   0   1
     0 226  20
     1  61 245

Test dataset performance:
Confusion Matrix (Accuracy 0.8623)

       Prediction
Actual  0  1
     0 58  3
     1 16 61
In [77]:
# Print the precision and recall scores
precision = precision_score(y_test, bestXGB.predict(X_test))
print("Precision: ", precision)

# Calculate recall
recall = recall_score(y_test, bestXGB.predict(X_test))
print("Recall: ", recall)
Precision:  0.953125
Recall:  0.7922077922077922

Logistic Regression¶

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize the logistic regression model
lr = LogisticRegression()

# Defining a parameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Initialize GridSearchCV with the logistic regression model, parameter grid, and cv parameter
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc', error_score='raise')

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Instead of creating a new LogisticRegression model, use grid_search.best_estimator_ directly
# This is the model with the best parameters already fitted to the training data
best_lr = grid_search.best_estimator_

# Use the best model found by GridSearchCV to make predictions on the test set
y_pred = best_lr.predict(X_test)
y_pred_train = best_lr.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy on the test set
print("Training set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)
Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Training set accuracy: 0.8677536231884058
Test set accuracy: 0.8768115942028986
In [79]:
from sklearn.metrics import confusion_matrix

# y_pred contains model predictions and y_test contains the true labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[57  4]
 [13 64]]
In [80]:
best_model = grid_search.best_estimator_

# Coefficients of the best model
coefficients = best_model.coef_

feature_names = X.columns

# Print the coefficients for each feature for each class
print("\nCoefficients for each class against each feature:")
for index, class_coefficients in enumerate(coefficients):
    print(f"Class {index}:")
    for feature, coef in zip(feature_names, class_coefficients):
        print(f"{feature}: {coef}")
Coefficients for each class against each feature:
Class 0:
Debt: 0.08387356320869674
BankCustomer: 0.3266207931197453
EducationLevel: -0.13856405553456244
YearsEmployed: -0.29557509653458
PriorDefault: -1.5797169045768271
Employed: -0.21709257430133705
CreditScore: -0.5108983504902503
Income: -1.6569334783397407
In [81]:
# Removing gender, debt, and education level
# Corresponds to indexes 0, 1, and 4
X_modified = np.delete(X_scaled, [0, 1, 4], axis=1)

print(X_modified)

# Recreating the sample
X_train, X_test, y_train, y_test = train_test_split(X_modified, y, test_size = 0.20, random_state = 42, stratify = y)
[[-0.55949891  1.47139336  0.95465038  1.15714435 -0.28810053 -0.19541334]
 [-0.55949891  1.00439179  0.95465038  1.15714435  0.74082993 -0.08785188]
 [-0.55949891  1.00439179  0.95465038 -0.86419641 -0.49388662 -0.03714433]
 ...
 [ 1.79449039 -0.16311214 -1.04750391  1.15714435 -0.28810053 -0.19522126]
 [-0.55949891 -1.33061608 -1.04750391 -0.86419641 -0.49388662 -0.05135781]
 [-0.55949891 -1.09711529 -1.04750391 -0.86419641 -0.49388662 -0.19541334]]
In [82]:
# Converting
lr = LogisticRegression()

# Defining a parameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Norm used in the penalization
    'solver': ['liblinear']  # Algorithm to use in the optimization problem, 'liblinear' works well for small datasets
}

# Initialize GridSearchCV with the logistic regression model, parameter grid, and cv parameter
# cv=5 specifies the number of folds for cross-validation
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc', error_score= 'raise')

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Use the best model found by GridSearchCV to make predictions on the test set
y_pred = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy on the test set
print("Training set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)
Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Training set accuracy: 0.8858695652173914
Test set accuracy: 0.7971014492753623
In [83]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[52  9]
 [19 58]]
In [84]:
#  precision
precision = precision_score(y_test, y_pred, average='binary')
print("Precision: ", precision)

# Calculate recall
recall = recall_score(y_test, y_pred, average='binary')
print("Recall: ", recall)
Precision:  0.8656716417910447
Recall:  0.7532467532467533

Neural Network¶

Note: please see separate submission for the neural network script